clear
global data "C:\Users\taiwo\Downloads\Nigeria Data\repdata\"
cd "$data"
import excel "$data\Projects_phase.xlsx", sheet("Sheet1") firstrow case(lower)
encode fuel, generate(fuel_n)
tab  fuel_n
bro fuel_n
bro fuel
bro fuel_n
recode  fuel_n (1/2=0)(3/5=1)(6=0)(7/14=1)(15=0)(16/27=1)(28/29=0)(30/31=1)(32=0)(33=1)(34=0), gen (nonrenewable)
tab status
gen status_rank = .
replace status_rank = 1 if status == "Operating"
replace status_rank = 2 if status == "Planned"
replace status_rank = 3 if status == "Construction"
replace status_rank = 4 if status == "No evidence of progress"
replace status_rank = 5 if status == "Ceased operations"
replace status_rank = 6 if status == "Abandoned"
replace status_rank = 7 if status == "Rehabilitation"

*getting power plants with operating years*
gen year=year(dofc(commercialoperationsdatedate))

* Collapse data: sum installed capacities and keep the preferred status
collapse (sum)  installedcapacitymw (min) status_rank (min) year (max)  nonrenewable, by(projectid)

* Map the status rank back to status text
gen status_phase = ""
replace status_phase = "Operating" if status_rank == 1 
replace status_phase = "Planned" if status_rank == 2 
replace status_phase = "Construction" if status_rank ==3 
replace status_phase = "No evidence of progress" if status_rank == 4 
replace status_phase = "Ceased operations" if status_rank == 5 
replace status_phase = "Abandoned" if status_rank == 6

save "$dataprojectphase.dta", replace

clear
import excel "$data\Projects.xlsx", sheet("Sheet1") firstrow case(lower)
rename id projectid
merge 1:1 projectid using "$data\projectphase.dta"
drop _merge
export excel using "$data\PP061824.xls", firstrow(variables) replace

***export to excel to join power plants to clusters

***import excel containing power plants and clusters

clear

import excel "$data\Nonoperatingclusters_TableToExcel.xlsx", sheet("Nonoperatingclusters") firstrow case(lower)
rename projectid projectid_non
rename coordinates coordinates_non
rename f3 f3_non
rename geojsonexact geojsonexact_non
rename projectname projectname_non
rename status status_non
rename ownershiptype ownershiptype_non
rename connectiontype connectiontype_non
rename installedcapacitymw installedcapacitymw_non
rename status_rank status_rank_non
rename year year_non
rename nonrenewable nonrenewable_non
rename status_phase status_phase_non
browse
rename distance distance_non
save "$data\nonoperatingclusters.dta", replace

clear
import excel "$data\Operatingclusters_TableToExcel.xlsx", sheet("Operatingclusters") firstrow case(lower)
merge 1:1 objectid dhsid dhscc dhsyear dhsclust latnum longnum using "$data\nonoperatingclusters.dta"
drop _merge


***focusing on households within 20km of power plants
gen treat_20=1 if distance<20000
replace treat_20=0 if distance_non<20000 & distance>20000
drop if treat_20==.
replace distance = distance_non if treat_20==0 
replace projectid= projectid_non if treat_20==0 
replace coordinates = coordinates_non if treat_20==0 
replace f3 = f3_non if treat_20==0 
replace geojsonexact = geojsonexact_non if treat_20==0 
replace projectname = projectname_non if treat_20==0 
replace status= status_non if treat_20==0 
replace ownershiptype = ownershiptype_non if treat_20==0 
replace connectiontype = connectiontype_non if treat_20==0 
replace installedcapacitymw = installedcapacitymw_non if treat_20==0 
replace status_rank = status_rank_non if treat_20==0 
destring year_non, generate(year_non_new)
replace year = year_non_new if treat_20==0 
replace nonrenewable = nonrenewable_non if treat_20==0 
replace status_phase = status_phase_non if treat_20==0 


tab treat_20
drop distance_non projectid_non coordinates_non f3_non geojsonexact_non projectname_non status_non ownershiptype_non connectiontype_non installedcapacitymw_non status_rank_non year_non year_non_new nonrenewable_non status_phase_non
save "$data\clusters_within_20km.dta", replace

***geocovariates for all
import delimited "$data\NGGC81FL2021\NGGC81FL\NGGC81FL.csv", bindquote(strict) clear 
save "$data\2021_covariates.dta", replace
import delimited "$data\NGGC7BFL2018\NGGC7BFL.csv", bindquote(strict) clear 
save "$data\2018_covariates.dta",replace
import delimited "$data\NGGC6BFL2013\NGGC6BFL.csv", bindquote(strict) clear 
save "$data\2013_covariates.dta", replace
import delimited "$data\NGGC72FL2015\NGGC72FL.csv", bindquote(strict) clear 
save "$data\2015_covariates.dta", replace
import delimited "$data\NGGC62FL2010\NGGC62FL.csv", bindquote(strict) clear 
save "$data\2010_covariates.dta", replace
import delimited "$data\NGGC52FL2008\NGGC52FL.csv", bindquote(strict) clear 
save "$data\2008_covariates.dta", replace
import delimited "$data\NGGC4BFL2003\NGGC4BFL.csv", bindquote(strict) clear 
save "$data\2003_covariates.dta", replace
append using "$data\2008_covariates.dta" "$data\2010_covariates.dta" "$data\2013_covariates.dta" "$data\2015_covariates.dta" "$data\2018_covariates.dta" "$data\2021_covariates.dta", force
tab dhsyear
save "$data\allyr_covariates.dta", replace

gen mean_temp=.
replace mean_temp= mean_temperature_2000 if dhsyear==2003
replace mean_temp= mean_temperature_2005 if dhsyear==2008
replace mean_temp= mean_temperature_2010 if dhsyear==2010
replace mean_temp= mean_temperature_2010 if dhsyear==2013
replace mean_temp= mean_temperature_2015 if dhsyear==2015
replace mean_temp= mean_temperature_2015 if dhsyear==2018
replace mean_temp= mean_temperature_2020 if dhsyear==2021

gen rainfall=.
replace rainfall= rainfall_2000 if dhsyear==2003
replace rainfall= rainfall_2005 if dhsyear==2008
replace rainfall= rainfall_2010 if dhsyear==2010
replace rainfall= rainfall_2010 if dhsyear==2013
replace rainfall= rainfall_2015 if dhsyear==2015
replace rainfall= rainfall_2015 if dhsyear==2018
replace rainfall= rainfall_2020 if dhsyear==2021

gen travelminutes=travel_times_2000
replace travelminutes=travel_times_2015 if dhsyear==2015
replace travelminutes=travel_times_2015 if dhsyear==2018

/*travel times not present in 2021*/
replace travelminutes=travel_times_2015 if dhsyear==2021

gen veg_index= enhanced_vegetation_index_2015
replace veg_index= enhanced_vegetation_index_2000 if dhsyear==2003
replace veg_index= enhanced_vegetation_index_2005 if dhsyear==2008
replace veg_index= enhanced_vegetation_index_2010 if dhsyear==2010
replace veg_index= enhanced_vegetation_index_2010 if dhsyear==2013
replace veg_index= enhanced_vegetation_index_2020 if dhsyear==2021

gen precipitation= annual_precipitation_2015
replace precipitation= annual_precipitation_2000 if dhsyear==2003
replace precipitation= annual_precipitation_2005 if dhsyear==2008
replace precipitation= annual_precipitation_2010 if dhsyear==2010
replace precipitation= annual_precipitation_2010 if dhsyear==2013

/*annual_precipitation not present in 2021*/
replace precipitation= annual_precipitation_2015 if dhsyear==2021

gen aridity= aridity_2015
replace aridity= aridity_2010 if dhsyear==2013
replace aridity= aridity_2010 if dhsyear==2010
replace aridity= aridity_2005 if dhsyear==2008
replace aridity= aridity_2000 if dhsyear==2003
replace aridity= aridity_2020 if dhsyear==2021


sum mean_temp aridity rainfall travelminutes veg_index precipitation proximity_to_protected_areas proximity_to_water
mvdecode slope mean_temp aridity rainfall travelminutes veg_index precipitation proximity_to_protected_areas proximity_to_water, mv(-9999)
corr slope  mean_temp aridity rainfall travelminutes veg_index precipitation proximity_to_water proximity_to_protected_areas
save "$data\allyr_covariates.dta", replace
keep  dhsyear dhsclust slope precipitation mean_temp rainfall veg_index travelminutes aridity proximity_to_protected_areas proximity_to_water
save "$data\allyr_somecovariates.dta", replace


*** Get Road data
import excel "$data\DHS_clusters.xlsx", sheet("NGGE7BFL.shp") firstrow case(lower) clear
rename near_dist road
save "$data\clusterswithroads.dta", replace

***clean DHS
clear matrix
clear mata
clear
set more off
set maxvar 10000
*2003 has spelling names and codes different from the other years*
use "$data\NGHR4BDT_2003\NGHR4BFL.DTA", clear
*to make the sampling stratification alike*
decode hv024, gen (hv024_1)
replace hv024_1="nc" if hv024==1
replace hv024_1="ne" if hv024==2
replace hv024_1="nw" if hv024==3
replace hv024_1="se" if hv024==4
replace hv024_1="ss" if hv024==5
replace hv024_1="sw" if hv024==6
decode hv025, gen (hv025_1)
decode shstate, gen (shstate_1)
replace shstate_1="zamfara" if shstate_1=="zamfora" 
replace shstate_1="fct abuja" if shstate_1=="abuja (fct)" 
replace shstate_1=lower(shstate_1)
/*encode shstate_1, generate(shstate_2)
bro shstate*
drop shstate
rename shstate_2 shstate*/
gen hv022_1 = hv024_1 +" "+ shstate_1+" "+  hv025_1
encode hv022_1, gen (hv022_2)
bro hv022_2
save "$data\NGHR4BDT_2003\NGHR4BFL_2003amended.dta", replace

***use new 2021 data
use "$data\NGHR81DT_2021\NGHR81FL.DTA", clear
decode  hv024, gen (shstate_1)
tab shstate_1
decode hv025, gen (hv025_1)
save "$data\NGHR81DT_2021\NGHR81FLamended.DTA", replace

use "$data\NGHR7ADT_2018\NGHR7AFL.DTA", clear
append using "$data\NGHR71DT_2015\NGHR71FL.DTA"
append using "$data\NGHR6ADT_2013\NGHR6AFL.DTA"
append using "$data\NGHR61DT_2010\NGHR61FL.DTA"
append using "$data\NGHR53DT_2008\NGHR53FL.DTA"
*to make the sampling stratification alike*
decode hv024, gen (hv024_1)
replace hv024_1="nc" if hv024==1
replace hv024_1="ne" if hv024==2
replace hv024_1="nw" if hv024==3
replace hv024_1="se" if hv024==4
replace hv024_1="ss" if hv024==5
replace hv024_1="sw" if hv024==6
decode hv025, gen (hv025_1)
decode shstate, gen (shstate_1)
replace shstate_1=lower(shstate_1)
gen hv022_1 = hv024_1 +" "+ shstate_1+" "+  hv025_1
encode hv022_1, gen (hv022_2)
bro hv022_2
append using "$data\NGHR4BDT_2003\NGHR4BFL_2003amended.dta"
tab hv022_2 hv007
append using "$data\NGHR81DT_2021\NGHR81FLamended.DTA" 

***generate strata new to accomodate 2021 data***

gen strata = shstate_1+" "+  hv025_1

*recode of household head sex/gender, HH head male(1) female(0)*
tab hv219, missing
recode hv219 (2=0), gen(hheadsex)
label var hheadsex "1=male and 0=female"

*age of household head*
tab hv220 hv007, missing
replace hv220=. if hv220==98
replace hv220=. if hv220==99
generate hheadage=hv220

*number of household members *
tab hv009 hv007, missing 

/**make unique strata values by region/urban-rural (label option automatically labels the results) 
egen strata = group(hv024 hv025), label 
*check results 
tab strata hv007, missing*/

*Recode missing values to '.'
summ hv025 hv026 hv244 hv246 hv247 hv219  hv109_*
*8 is DK (don't know) and 9 is missing-treat as missing*
mvdecode hv025 hv026 hv244 hv246 hv247 hv219  hv109_*, mv(8,9)

summ hv201 hv220 hv246*
*98 is DK (don't know or unknown) and 99 is missing-treat as missing*
mvdecode hv201 hv220 hv246*, mv(98,99)

summ hv204 hv245
*998 is DK (don't know or unknown) and 999 is missing-treat as missing*
mvdecode hv204 hv245, mv(999,998)

*recode water source on premises to 0*
replace hv204=0 if hv204==996 

*HH head education level* not available for 2021 data
gen hheadedu=.
forval j=1/9 {
replace hheadedu=hv109_0`j' if hv101_0`j'==1
}

forval j=10/37 {
replace hheadedu=hv109_`j' if hv101_`j'==1
}

*HH head education years*
mvdecode hv108* , mv(99, 97, 98)
gen hhdedu=.
forval j=1/9 {
replace hhdedu=hv108_0`j' if hv101_0`j'==1
}

forval j=10/43 {
replace hhdedu=hv108_`j' if hv101_`j'==1
}

*Rename some variables for easy identification*
gen timewatersource=hv204
gen under5=hv014 
gen livestock=hv246
gen bank=hv247

*Children Height Weight and Age Information
*Recode missing values to '.'
mvdecode hc6_* hc9_* hc12_*, mv(99998,99999)

*Percentage reference medians for weight/age (hc9_`j'), height/age (hc6_`j'), weight/height (hc12_`j') 
egen avgperc_refwa=rowmean(hc9_*)
egen avgperc_refha=rowmean(hc6_*)
egen avgperc_refwh=rowmean(hc12_*)


*Malaria Hemoglobin, and Anaemia Level.*  
mvdecode hc53_* , mv(994,995, 996, 999)
mvdecode hc56_* , mv(999)
mvdecode hc57_* , mv(9)
mvdecode hml32_* hml35_* , mv(6, 7, 9)

*Observations for malaria and hemoglobin are small by household*
egen avghemoglobin=rowmean (hc53_*)
egen avgmalaria=rowmean (hml32_*)
*only 7804 out of 120244 were matched*
rename hv001 dhsclust
rename hv007 dhsyear
recode hv025 (2=0), gen(urban)
gen wealthind=hv270
gen wealthscore=hv271/100000
xtile wealthind_r=wealthscore, nq(10)
gen hhnumber=hv009
merge m:1 dhsclust dhsyear using "$data\clusters_within_20km.dta"
keep if _merge==3
drop _merge
merge m:1 dhsclust dhsyear using "$data\clusterswithroads.dta"
keep if _merge==3
drop _merge
merge m:1 dhsclust dhsyear using "$data\allyr_somecovariates.dta"
keep if _merge==3
drop _merge

replace  proximity_to_protected_areas= proximity_to_protected_areas/1000
replace proximity_to_water=proximity_to_water/1000
label var treat_20 "1=Household is within 20km of operational power plant and 0=otherwise"
***otherwise means Household is within 20km of non-operational power plant only***

save "$data\newdata062124.dta",replace

clear matrix
clear mata
clear
set more off
set maxvar 10000
use "$data\newdata062124.dta", clear
drop if year>0 & year<2003
drop if year==. & status=="Operating"
replace year=. if year==0



replace shstate_1 = "nasarawa" if shstate_1 == "nassarawa"
replace shstate_1 = "fct abuja" if shstate_1 == "fct"
encode shstate_1, gen(state)
bro state

* Convert the numeric 'state' variable to a string variable with state names.
gen state_str = ""
replace state_str = "Abia" if state == 1
replace state_str = "Adamawa" if state == 2
replace state_str = "Akwa Ibom" if state == 3
replace state_str = "Anambra" if state == 4
replace state_str = "Bauchi" if state == 5
replace state_str = "Bayelsa" if state == 6
replace state_str = "Benue" if state == 7
replace state_str = "Borno" if state == 8
replace state_str = "Cross River" if state == 9
replace state_str = "Delta" if state == 10
replace state_str = "Ebonyi" if state == 11
replace state_str = "Edo" if state == 12
replace state_str = "Ekiti" if state == 13
replace state_str = "Enugu" if state == 14
replace state_str = "FCT Abuja" if state == 15
replace state_str = "Gombe" if state == 16
replace state_str = "Imo" if state == 17
replace state_str = "Jigawa" if state == 18
replace state_str = "Kaduna" if state == 19
replace state_str = "Kano" if state == 20
replace state_str = "Katsina" if state == 21
replace state_str = "Kebbi" if state == 22
replace state_str = "Kogi" if state == 23
replace state_str = "Kwara" if state == 24
replace state_str = "Lagos" if state == 25
replace state_str = "Nasarawa" if state == 26
replace state_str = "Niger" if state == 27
replace state_str = "Ogun" if state == 28
replace state_str = "Ondo" if state == 29
replace state_str = "Osun" if state == 30
replace state_str = "Oyo" if state == 31
replace state_str = "Plateau" if state == 32
replace state_str = "Rivers" if state == 33
replace state_str = "Sokoto" if state == 34
replace state_str = "Taraba" if state == 35
replace state_str = "Yobe" if state == 36
replace state_str = "Zamfara" if state == 37

* Create a new variable for the geopolitical zones
gen geo_zone = ""

* Assign states to their respective geopolitical zones

* South East
replace geo_zone = "South East" if inlist(state_str, "Abia", "Anambra", "Ebonyi", "Enugu", "Imo")

* South South
replace geo_zone = "South South" if inlist(state_str, "Akwa Ibom", "Bayelsa", "Cross River", "Delta", "Edo", "Rivers")

* South West
replace geo_zone = "South West" if inlist(state_str, "Ekiti", "Lagos", "Ogun", "Ondo", "Osun", "Oyo")

* North Central
replace geo_zone = "North Central" if inlist(state_str, "Benue", "Kogi", "Kwara", "Nasarawa", "Niger", "Plateau", "FCT Abuja")

* North East
replace geo_zone = "North East" if inlist(state_str, "Adamawa", "Bauchi", "Borno", "Gombe", "Taraba", "Yobe")

* North West
replace geo_zone = "North West" if inlist(state_str, "Jigawa", "Kaduna", "Kano", "Katsina", "Kebbi", "Sokoto", "Zamfara")

* Check if all states are correctly assigned
tab state geo_zone

* Create a new variable for the geopolitical zones that is encoded
encode geo_zone, generate(geo_zone_n)


gen timeToTreat=dhsyear-year
replace timeToTreat=. if treat_20==0

*separate the actual controls from households near power plants who is active but were non-operational as at the time survey was conducted.
gen intervention = 0
replace intervention=1 if treat_20==1 & timeToTreat>=0


bro state
xtset state

***Bin the years***
recode year (2004/2007=2008) (2009=2010) (2011/2012=2013) (2014=2015) (2016/2017=2018) (2019/2020=2021), gen(newyear)
recode newyear (2003=1) (2008=2) (2010=3) (2013=4) (2015=5) (2018=6) (2021=7), gen(pyear)
recode dhsyear (2003=1) (2008=2) (2010=3) (2013=4) (2015=5) (2018=6) (2021=7), gen(syear)
gen treattime=syear-pyear
replace treattime=. if treat_20==0




* Generate new variables for road and average vegetable index
gen road_n=road/1000
gen veg_index_n=veg_index/1000

save "$data\newdata_matching.dta", replace

